This notebook is intended to provide examples of how Hierarchical (Multilevel / Mixed Effects) Models induce estimate shrinkage via partial-pooling (i.e. effect is similar to regularization) for nested data.

The original dataset is related to website bounce rates and can be found in the Kaggle notebook here.

The overall goal is to understand how the average bounce time on a website changes with respect to age across counties.

Note: For simplicity we’re ignoring the fact that each county in the dataset contains different locations. If you’d like to consider the effect of those locations you can use the lmer package to fit a cross-nested model structure.

Load Packages

# Use library() rather than require(): require() returns FALSE on a missing
# package instead of erroring, so failures would surface later and less clearly.

# Core data manipulation and reporting
library(tidyverse)
library(dplyr) # dplyr 1.0.0
library(knitr)
library(kableExtra)
library(broom)

# Animation libraries
library(gganimate)
library(gifski)
library(png)

# Mixed Effects Model libraries
library(lme4)
library(brms)
library(tidybayes)
library(arm)
library(bayesplot)

# Prior libraries
library(extraDistr)

Load Data

We first load the data. I’ve simulated data based on the original data, varying the group sizes so that some groups have few data points and others have more. Think of this as a scenario where some data was corrupted and you can only use a subset of the data points from a county.

It will also help illustrate partial-pooling and how Bayesian methods can help in later sections.

# Read the simulated bounce-rate data; cols() suppresses column-type messages
bounce_data <- read_csv("../data/bounce_rates_sim.csv", col_types = cols())

# Render a scrollable, styled preview table of the raw data
bounce_data %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  row_spec(0, background = "#4CAF50", color = "#FFF") %>%
  scroll_box(width = "100%", height = "200px")
bounce_time age county
225.8538 54.0338976 kent
217.9062 78.8297131 kent
220.6601 70.8424269 kent
223.4206 46.5553691 kent
213.8878 93.3078586 kent
206.4577 63.5873121 kent
234.4551 91.4863822 kent
203.5949 37.4415790 essex
203.6985 28.0789762 essex
221.7954 36.2951076 essex
212.3589 30.6550948 essex
215.5649 51.4444536 essex
201.6897 52.6734190 essex
205.5136 28.6809948 essex
211.1856 27.2302838 essex
184.2293 38.6716959 essex
203.6672 32.5243762 essex
210.3584 61.6451345 essex
211.9895 27.5876291 essex
208.9214 52.1319790 essex
213.3661 55.0352427 essex
212.4840 14.9024549 essex
201.7498 61.2564971 essex
221.7837 37.7669958 essex
209.9975 39.9692857 essex
205.7030 41.8025436 essex
196.0435 31.7530121 essex
216.0141 54.7360097 essex
198.6938 58.6835402 essex
184.5071 29.9833313 london
182.5263 24.1723319 london
180.2908 13.3496773 london
176.6328 25.6219742 london
172.5186 47.9649775 london
179.8916 0.6166381 london
161.4922 30.6829851 london
182.0156 36.1165353 london
194.6150 20.8453076 london
183.0910 53.0520379 london
184.4475 36.7119649 london
165.2503 12.0017953 london
181.7137 40.0946351 london
191.3859 25.3440220 london
171.8971 32.7839521 london
190.2567 36.2677987 london
184.4317 27.0373561 london
168.7719 13.8041686 london
178.9858 40.4456062 london
192.2939 14.6547403 london
178.4500 17.5377370 london
182.4974 16.2238708 london
182.3821 14.1194901 london
171.8979 13.9636521 london
165.9008 18.1099879 london
167.4110 58.8334292 london
185.5440 27.9782398 london
191.1776 28.4374256 london
189.3372 21.2460489 london
180.2835 31.8813689 london
172.2707 15.5581492 london
183.3813 31.0896525 london
195.2366 21.1088488 london
176.8045 6.1784167 london
170.7791 3.6813896 london
159.2062 20.7514884 london
170.3089 38.4134068 london
198.6933 48.1308804 devon
203.9536 69.5482854 devon
204.3526 70.7433714 devon
201.6434 87.9492766 devon
195.3188 110.0081789 devon
204.4924 72.9631288 devon
194.3613 71.9816568 devon
200.4898 87.7740291 devon
197.8712 84.8171529 devon
210.8970 80.3663507 devon
200.8695 83.3611431 devon
191.5367 93.6559102 devon
204.6859 56.1122650 devon
196.3898 66.7515070 devon
197.5210 78.7099304 devon
201.0421 67.3001997 devon
192.0654 26.1945308 devon
199.7605 57.4251312 devon
201.8048 86.3658944 devon
205.8579 73.4961535 devon
199.2470 72.0273560 devon
204.1690 103.5828629 devon
194.4276 71.5204216 devon
191.3841 66.5242650 devon
200.4391 80.2278495 devon
198.0288 29.1018606 devon
203.1436 57.0818017 devon
202.7748 71.3523719 devon
188.1232 88.0390629 devon
202.9197 93.7810972 devon
200.3105 67.9941480 devon
196.7775 69.6428531 devon
199.1221 64.1628275 devon
208.8129 76.3083359 devon
196.6335 74.2495564 devon
203.9025 96.6197603 devon
197.6037 73.2726791 devon
193.3032 69.3332928 devon
204.5152 80.3242278 devon
197.2597 84.4688060 devon
191.6854 55.4917210 devon
197.0951 58.1000094 devon
200.3260 106.3686455 devon
197.5353 43.4890796 devon
200.0882 63.9486839 devon
200.6170 66.8028078 devon
202.1392 79.3471172 devon
198.7077 75.2720741 devon
193.9549 43.7711217 devon
198.8015 66.1891047 devon
194.9967 80.5763924 devon
204.3312 77.3732602 devon
198.1364 86.1894137 devon
207.7474 81.6932305 devon
203.1465 63.5183996 devon
190.3096 84.4685811 devon
202.2230 91.5242916 devon
192.6323 68.8969240 devon
198.9416 58.9136619 devon
193.7122 70.1782239 devon
202.2707 79.4870632 devon
205.0753 113.9620971 devon
201.8092 88.8432740 devon
200.6674 75.5415878 devon
200.3422 62.7009176 devon
199.9825 103.0134347 devon
196.7411 62.4373976 devon
192.8372 78.5811108 devon
200.0539 84.3281631 devon
197.2865 100.5050568 devon
195.5131 78.4693243 devon
193.6063 60.6152536 devon
202.5980 87.1698923 devon
199.4300 62.3860521 devon
203.6946 81.5990143 devon
215.4648 113.0031062 dorset
200.2166 75.7654139 dorset
209.1769 56.9425849 dorset
175.9648 44.5681276 dorset
207.0599 80.9264892 dorset
190.7460 64.4317310 dorset
207.5366 75.7529361 dorset
212.6425 58.9790448 dorset
204.9520 83.6773553 dorset
209.0750 61.1754989 dorset
208.5712 43.9840365 dorset
203.9592 47.6253547 dorset
202.8222 52.4668764 dorset
211.2343 68.8392407 dorset
206.1818 64.1178333 dorset
194.8650 80.4965703 dorset
191.1600 48.5413880 dorset
199.9447 84.4747858 dorset
194.6530 28.8822625 dorset
191.9744 58.0842603 dorset
200.1989 64.8671015 dorset
209.0733 65.6065653 dorset
206.3179 44.0892739 dorset
185.9619 84.3928058 dorset
200.3820 81.7615930 dorset
193.6978 69.5529085 dorset
207.5257 61.0562947 dorset
195.5070 44.6490158 dorset
201.1868 90.5382608 dorset
201.5488 53.2478347 dorset
179.5727 54.0722800 dorset
218.2631 34.1546385 dorset
195.9146 60.4473141 dorset
199.6272 62.5265294 dorset
215.8019 96.4099257 dorset
182.6826 68.3610275 dorset
190.9447 70.3242604 dorset
195.9754 49.7787309 dorset
202.3283 40.9970438 dorset
201.4353 52.0458249 dorset
207.9298 68.3298135 dorset
210.2485 65.0849081 dorset
208.0484 47.0672040 dorset
195.9026 66.9701164 dorset
203.2758 51.6597917 dorset
193.9061 82.1474241 dorset
204.0604 63.0726315 dorset
205.3292 69.9914040 dorset
200.8077 61.8704037 dorset
195.2892 67.1641134 dorset
204.6368 64.9481059 dorset
202.5677 36.2135523 dorset
204.1111 60.4343694 dorset
203.9212 77.6145159 dorset
198.0681 46.6977227 dorset
199.2772 85.4822272 dorset
215.9674 63.5528864 dorset
198.2211 53.9920465 dorset
198.1251 69.1104677 dorset
211.6341 51.6930641 dorset
180.5467 40.1163216 dorset
200.3906 72.5353939 dorset
198.6198 60.7617568 dorset
209.8897 100.7767115 dorset
205.2606 81.6769185 dorset
222.4502 83.4098727 dorset
195.4427 60.1861572 dorset
196.0813 63.1445017 dorset
192.5134 78.1475734 dorset
217.9686 61.7855966 dorset
188.6774 41.2520223 dorset
188.7936 45.7480490 dorset
212.7729 51.3392792 dorset
202.8989 50.3114777 dorset
193.9978 39.4420012 dorset
200.7135 62.0385368 dorset
223.7702 83.3403105 dorset
194.9929 59.4222138 dorset
213.5304 39.9001758 dorset
203.4709 78.6441192 dorset
198.8800 90.6587434 dorset
205.5893 93.7301631 dorset
208.4568 79.6189482 dorset
191.7804 45.6014434 dorset
207.1614 73.6689730 dorset
183.9436 85.3543581 dorset
200.5705 65.7319394 dorset
226.9220 41.2668098 dorset
213.9588 80.8691816 dorset
203.7940 66.1592149 dorset
208.4021 68.1035747 cumbria
213.9375 71.1628039 cumbria
200.1463 71.2777877 cumbria
210.3203 66.3775058 cumbria
211.1401 72.7921117 cumbria
198.6172 67.5843731 cumbria
200.4856 72.8991625 cumbria
203.4624 46.4780895 cumbria
209.8694 68.2913066 cumbria
212.5938 13.3498169 cumbria
213.3855 68.6225103 cumbria
218.4301 46.2146239 cumbria
201.3730 49.4553456 cumbria
207.0558 50.6788473 cumbria
201.8000 74.2475422 cumbria
207.5178 73.4160696 cumbria
214.5728 70.4031623 cumbria
199.4696 67.3071122 cumbria
210.4172 63.6487516 cumbria
204.2227 69.8016918 cumbria
205.6621 74.9576590 cumbria
218.3393 51.5284157 cumbria
217.8684 75.3709309 cumbria
207.4168 87.2606490 cumbria
191.7129 34.6233057 cumbria
212.3685 35.0192346 cumbria
207.7391 53.3175083 cumbria
202.3393 31.2947156 cumbria
207.6240 60.3005852 cumbria
211.1069 49.2604887 cumbria
216.9618 60.3164348 cumbria
204.8985 40.8687095 cumbria
208.0298 70.0610490 cumbria
208.5042 63.1141711 cumbria
202.6250 51.5532537 cumbria
209.0654 71.1933170 cumbria
207.5364 66.2010990 cumbria
204.8710 66.4105841 cumbria
206.6085 44.6009728 cumbria
215.9537 62.3765923 cumbria
204.0067 52.0465730 cumbria
204.1681 69.9172135 cumbria
205.6398 99.0975693 cumbria
215.0105 61.0839176 cumbria
222.2287 60.2920212 cumbria
211.5478 93.1951370 cumbria
201.4563 34.9128639 cumbria
218.9502 45.8439897 cumbria
216.5111 128.7671202 cumbria
205.6643 31.7539260 cumbria
208.7487 47.6679717 cumbria
207.6356 75.2344626 cumbria
210.5432 78.5386609 cumbria
212.7470 60.7260834 cumbria
203.2712 49.1324334 cumbria
210.0344 82.7130053 cumbria
204.5591 42.3441804 cumbria
202.7044 43.6140668 cumbria
209.7123 32.7611145 cumbria
212.2624 74.0526820 cumbria
212.7655 65.2303318 cumbria
211.8085 62.8312716 cumbria
196.2341 46.1165224 cumbria
204.3112 48.1297188 cumbria
212.9128 47.6767236 cumbria
212.4191 49.2887500 cumbria
201.5805 67.1776235 cumbria
208.7316 47.1176613 cumbria
208.1921 77.3301174 cumbria
220.7695 46.8078264 cumbria
214.4266 52.5565007 cumbria
200.8323 68.5553601 cumbria
205.0210 48.8278650 cumbria
208.2647 50.2101166 cumbria
208.0480 42.7070270 cumbria
211.5435 68.8255436 cumbria
203.6096 40.7169313 cumbria
217.7464 92.2924582 cumbria
209.0103 57.3339810 cumbria
208.1187 48.5941368 cumbria
211.3844 57.8649992 cumbria
216.8920 64.7417808 cumbria
212.9242 72.1575443 cumbria
214.4708 54.2937216 cumbria
208.0446 83.3320376 cumbria
210.9401 59.8753501 cumbria
206.6836 68.7556004 cumbria
212.0873 67.9693555 cumbria
214.7157 49.8832742 cumbria
211.5558 70.3679192 cumbria
206.0166 35.7850225 cumbria
196.2511 78.8856796 cumbria
203.4005 58.3988318 cumbria
209.8707 48.1078972 cumbria
217.3593 58.1306070 cumbria
211.5424 75.1636276 cumbria
214.0327 69.9658272 cumbria
208.8590 72.6652255 cumbria
201.8902 71.5765933 cumbria
205.3669 75.1597126 cumbria
215.8747 85.9165965 cumbria
204.0299 61.9563821 cumbria
205.2632 52.8376082 cumbria
210.2801 38.8162180 cumbria
221.1225 71.7880492 cumbria
204.9159 47.9188851 cumbria
215.3467 74.8866790 cumbria
206.4351 63.0405642 cumbria
204.9950 76.0904226 cumbria
209.5784 47.7070588 cumbria
210.9453 57.3677331 cumbria
206.6750 46.0450899 cumbria
177.6245 30.3243892 norfolk
182.1104 59.7624135 norfolk
175.4903 24.3424640 norfolk
177.1250 3.4555463 norfolk
185.6625 42.8386175 norfolk
182.3939 34.2396776 norfolk
186.2690 42.8054714 norfolk
156.0129 25.1805297 norfolk
182.7219 33.4428024 norfolk
170.3454 16.3007618 norfolk
177.1850 23.1601323 norfolk
189.1116 16.6417034 norfolk
177.1237 31.0178915 norfolk
184.0938 26.4600152 norfolk
178.4260 10.2024442 norfolk
174.4067 9.9296068 norfolk
183.0203 -1.3688121 norfolk
178.3646 47.6362663 norfolk
170.4670 33.1970090 norfolk
178.2488 36.7458759 norfolk
174.0448 51.4282092 norfolk
166.9864 48.3338420 norfolk
185.9389 38.9343453 norfolk
183.6586 12.8125019 norfolk
167.9151 32.8333286 norfolk
181.7819 27.7025989 norfolk
177.6393 18.9172124 norfolk
190.4892 42.0950792 norfolk
189.3262 23.8045842 norfolk
175.1466 32.6076316 norfolk
172.5747 40.8840877 norfolk
179.4212 36.3111909 norfolk
177.7364 14.8073732 norfolk
195.6366 59.6851980 norfolk
188.7748 21.6540076 norfolk
173.7379 40.2931054 norfolk
187.2555 33.7627070 norfolk
186.4125 38.8512377 norfolk
174.1933 40.0978380 norfolk
172.5385 34.3713081 norfolk
179.8547 23.3687044 norfolk
184.6571 28.0914785 norfolk
177.3976 39.3373139 norfolk
175.2949 45.2646859 norfolk
181.5672 33.3339050 norfolk
179.0471 50.7538823 norfolk
176.1024 20.3279934 norfolk
198.8694 19.4848379 norfolk
187.0505 26.2654827 norfolk
178.3498 23.0138017 norfolk
188.5443 51.2240337 norfolk
184.8811 24.7544349 norfolk
176.2062 23.1680538 norfolk
174.8356 42.8592904 norfolk
182.0219 26.5169495 norfolk
175.9168 8.9187931 norfolk
172.4202 61.3434005 norfolk
189.6861 45.5005994 norfolk
171.9307 22.6656666 norfolk
185.9969 17.0042500 norfolk
181.1680 10.3314696 norfolk
187.3300 14.3227392 norfolk
184.4892 30.7891176 norfolk
185.3808 26.1332192 norfolk
175.1346 53.3342564 norfolk
174.8861 44.7958652 norfolk
172.2547 28.0869203 norfolk
173.2351 37.6817821 norfolk
181.3234 49.6138718 norfolk
167.4591 18.2776405 norfolk
198.7599 31.4106927 norfolk
180.9043 36.1373385 norfolk
179.2687 47.7732876 norfolk
191.1251 33.6304979 norfolk
183.8592 48.6561583 norfolk
186.4512 35.6241385 norfolk
172.4647 45.0611858 norfolk
185.5850 16.2535479 norfolk
190.6732 47.1206922 norfolk
181.4938 17.7815486 norfolk
175.3785 41.1253219 norfolk
186.9373 45.0087843 norfolk
176.1613 34.4553405 norfolk
168.1130 24.9561962 norfolk
176.6706 43.6596224 norfolk
183.8519 20.4473341 norfolk
199.1827 56.6103566 norfolk
177.7475 50.2797722 norfolk
183.4855 24.8496194 norfolk
193.8418 27.9935593 norfolk
161.0385 28.5824222 norfolk
157.2738 22.9418361 norfolk
177.4387 15.6077106 norfolk
170.8876 41.9406177 norfolk
187.2842 24.5350610 norfolk
173.5743 27.0215158 norfolk
184.4781 27.2440687 norfolk
187.9249 27.2687049 norfolk
180.6428 -0.7019617 norfolk
173.9960 39.5977737 norfolk
178.8904 47.3851863 norfolk
178.7011 18.7494730 norfolk
183.4040 34.2409402 norfolk
181.4545 44.7699153 norfolk
194.7127 29.5122811 norfolk
175.5028 64.0123130 norfolk
177.7311 36.5778049 norfolk
179.3548 0.9141365 norfolk
191.8206 27.6698795 norfolk
179.2707 37.5737531 norfolk
185.6427 47.3963036 norfolk
190.0737 24.7306772 norfolk
179.9949 29.5884747 norfolk
175.1075 20.4577557 norfolk
174.1200 45.2312772 norfolk
189.9190 20.1298323 norfolk
174.6957 11.9154419 norfolk
179.9953 21.0002359 norfolk
173.8736 20.4323262 norfolk
173.5177 13.4978599 norfolk
221.9283 38.4742167 cheshire
207.9079 45.4748607 cheshire
212.7421 41.8737247 cheshire
234.4364 26.3455989 cheshire
211.3112 6.0254376 cheshire
215.6437 31.1458104 cheshire
215.7583 30.0761878 cheshire
225.9816 13.3247270 cheshire
202.2175 47.4680502 cheshire
216.4464 54.3233425 cheshire
189.2030 12.4240670 cheshire
213.7356 26.5020247 cheshire
215.4297 27.7379429 cheshire
205.6262 41.0406345 cheshire
209.7567 14.9109774 cheshire
218.6768 53.3697633 cheshire
207.1210 40.3001202 cheshire
197.7208 52.3991091 cheshire
205.3591 15.4844801 cheshire
203.8642 30.0364060 cheshire
220.2534 46.9418857 cheshire
215.8066 63.1028203 cheshire
204.6500 33.0036767 cheshire
211.3456 23.4902948 cheshire
209.1033 46.4478498 cheshire
204.3843 57.2268843 cheshire
214.7387 52.9091833 cheshire
225.3257 33.4875911 cheshire
209.5644 42.0848245 cheshire
212.9073 34.0267763 cheshire
203.4897 20.3144413 cheshire
218.3658 45.0602569 cheshire
209.7521 36.1162393 cheshire
199.4428 43.7809334 cheshire
197.7247 35.7615946 cheshire
201.4917 48.1162715 cheshire
216.7140 21.0056306 cheshire
210.2920 48.4392415 cheshire
227.7717 26.1951558 cheshire
199.7231 52.8623345 cheshire
212.3758 33.7169802 cheshire
218.3416 39.0152634 cheshire
216.1571 49.8785343 cheshire
203.6633 35.5441990 cheshire
220.4490 59.1754341 cheshire
213.7362 48.7871858 cheshire
192.0860 24.7542934 cheshire
219.5891 18.3244731 cheshire
215.3220 36.1208784 cheshire
217.1592 54.6491550 cheshire
229.1985 58.1240030 cheshire
197.4211 28.0008207 cheshire
194.6101 59.3781492 cheshire
215.6196 36.2725542 cheshire
210.6817 54.6376636 cheshire
198.6076 20.0896767 cheshire
217.9418 69.4445812 cheshire
212.8855 32.4565696 cheshire
211.8936 20.7247118 cheshire
224.2742 68.9384881 cheshire
212.0488 33.3224516 cheshire
207.6988 45.8317298 cheshire
210.7533 41.2070510 cheshire
199.7975 25.8801438 cheshire
199.2004 49.7437200 cheshire
213.7474 31.0755278 cheshire
232.6517 39.2014837 cheshire
214.5394 48.5109657 cheshire
227.1374 42.8325840 cheshire
223.6423 15.6686131 cheshire
220.2531 40.2536018 cheshire
204.0903 35.3317513 cheshire
216.6103 25.3493716 cheshire
204.3529 24.3093114 cheshire
202.9545 17.9462145 cheshire
220.6908 58.8813616 cheshire
197.4419 20.9556699 cheshire
219.2022 53.8666390 cheshire
200.6058 54.3719240 cheshire
203.2914 33.8482872 cheshire
221.4731 31.5800795 cheshire
191.6479 61.7359334 cheshire
214.5692 61.6148578 cheshire
217.1172 44.2591331 cheshire
219.7899 26.5915075 cheshire
209.0349 9.0573445 cheshire
194.6766 19.9806463 cheshire
212.7522 25.7547861 cheshire
207.9540 33.7243840 cheshire
220.2253 51.0265594 cheshire
202.2456 40.5811917 cheshire
210.2562 33.1727799 cheshire
210.7081 50.9965280 cheshire
213.7315 61.4273448 cheshire
213.1203 69.0570155 cheshire
200.2013 16.8741807 cheshire
219.4747 31.6274991 cheshire
196.3528 43.1793571 cheshire
218.0399 45.3262013 cheshire
198.7352 21.6516396 cheshire
226.9599 43.6253110 cheshire
199.0811 41.6476505 cheshire
217.5453 22.9160904 cheshire
215.1293 28.6230003 cheshire
215.4584 66.4014542 cheshire
211.2915 62.3075307 cheshire
210.7704 65.8584379 cheshire
218.8084 30.4536576 cheshire
198.0270 33.0769354 cheshire
220.4636 22.5261903 cheshire
213.6203 17.1198733 cheshire
211.6374 12.0066772 cheshire
201.6045 38.2147597 cheshire
204.0669 59.6731055 cheshire
198.6537 22.3421179 cheshire
205.1926 36.8860302 cheshire
208.2451 1.3834385 cheshire
202.8818 38.3636573 cheshire
217.4679 51.7275694 cheshire
205.9116 54.9958716 cheshire
209.3939 43.0117799 cheshire
212.2201 39.3275010 cheshire
220.3121 57.1293194 cheshire
209.2730 42.8702028 cheshire
222.7670 46.7792802 cheshire
213.5815 32.9050651 cheshire
225.5685 43.2560941 cheshire
212.8360 50.9297720 cheshire
218.4750 40.3135427 cheshire
201.4827 52.2098080 cheshire
220.6145 70.9991837 cheshire
212.0379 12.2597213 cheshire
205.4092 52.7721645 cheshire
233.1305 52.7686105 cheshire
192.8963 29.9640561 cheshire
203.7808 31.4121026 cheshire
207.6008 11.4867583 cheshire
209.9624 42.4685669 cheshire
210.4145 14.6512215 cheshire
219.7713 20.7662302 cheshire
212.7669 30.7058228 cheshire
210.4563 48.4524502 cheshire
196.7473 71.4371569 cheshire
221.8812 26.4152223 cheshire
222.3395 23.2133445 cheshire
232.8309 58.3377577 cheshire
223.0557 30.7829841 cheshire
207.5521 29.7307659 cheshire
210.3355 33.5451945 cheshire
216.5485 38.6243033 cheshire

We notice there aren’t any missing data, but we can center-scale the age variable. This is helpful when interpreting the model coefficients in our case (especially when the dependent and independent variables have different scales, e.g. a population variable).

# Check distribution of data: confirms there are no NAs and shows the raw age scale
summary(bounce_data)
##   bounce_time         age             county         
##  Min.   :156.0   Min.   : -1.369   Length:613        
##  1st Qu.:190.3   1st Qu.: 31.628   Class :character  
##  Median :202.6   Median : 47.677   Mode  :character  
##  Mean   :200.0   Mean   : 49.021                     
##  3rd Qu.:210.8   3rd Qu.: 65.858                     
##  Max.   :234.5   Max.   :128.767

Standardize (center-scale)

In this case, we standardize primarily to avoid having the variance of our linear model estimates be on different scales (i.e. it provides some stability to our estimates). Next, if we didn’t standardize (center-scale) the data then the intercept would be interpreted as the “expected bounce rate when a person is 0 years old”, which doesn’t make much sense. To fix this we standardize the age variable so we can interpret the effect of the variable as deviations from the mean age in the dataset (e.g. “an increase of one standard deviation in age changes the expected bounce rate by X amount”). The intercept here can be interpreted as the expected bounce time at the average age.

# Standardize (z-score) age so coefficients are interpretable at the mean age;
# as.numeric() drops the matrix structure that scale() returns
bounce_data <- bounce_data %>%
  mutate(std_age = as.numeric(scale(age))) %>%
  relocate(std_age, .after = age)

# Inspect the new std_age column (mean 0, sd 1)
summary(bounce_data)
##   bounce_time         age             std_age            county         
##  Min.   :156.0   Min.   : -1.369   Min.   :-2.24338   Length:613        
##  1st Qu.:190.3   1st Qu.: 31.628   1st Qu.:-0.77438   Class :character  
##  Median :202.6   Median : 47.677   Median :-0.05986   Mode  :character  
##  Mean   :200.0   Mean   : 49.021   Mean   : 0.00000                     
##  3rd Qu.:210.8   3rd Qu.: 65.858   3rd Qu.: 0.74960                     
##  Max.   :234.5   Max.   :128.767   Max.   : 3.55032

Complete Pooling (Simple Linear Regression)

A simple linear regression on this data would be considered a complete pooling scenario because we are grouping all data together and not considering the county grouping structure. We can fit this model with the lm functions.

# Complete pooling: a single regression over all counties combined,
# ignoring the grouping structure entirely
complete_pool <- lm(bounce_time ~ std_age, data = bounce_data)

# Tidy summary of the pooled coefficients
tidy(complete_pool)
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   200.       0.570    351.   0.      
## 2 std_age         4.69     0.570      8.21 1.29e-15
# Overlay the single pooled trendline on the per-county scatter
pooled_coefs <- coef(complete_pool)
ggplot(bounce_data, aes(x = std_age, y = bounce_time, color = county)) +
  geom_point(alpha = 0.5) +
  geom_abline(aes(intercept = pooled_coefs[1], slope = pooled_coefs[2])) +
  labs(x = "Standardized Age", y = "Bounce time",
       title = "Simple linear regression (population trendline)")

Diagnostics

Our linear regression model runs on the assumptions that 1) the residuals are normally distributed and 2) their variance is constant. So we can evaluate our model’s fit using a residuals vs fitted plot below. If the assumptions are met, we won’t see any trend in the residuals in relation to the predicted values (i.e. they won’t increase/decrease with predicted values). However, in our case we see there is a trend, which violates this assumption and indicates a lack of fit.

# Residuals vs fitted diagnostic: any trend here signals lack of fit or
# non-constant variance. ggplot() replaces the deprecated qplot(); lm objects
# are fortified automatically, exposing .fitted and .resid columns.
ggplot(complete_pool, aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.8) +
  geom_smooth(se = FALSE, size = 1) +
  labs(y = "Observed - Predicted", x = "Predicted",
       title = "Complete pooling model shows slight lack of fit (variance isn't constant)") +
  theme(legend.position = "none",
        title = element_text(size = 10))

Evaluation

We also compute the RMSE for this model to set a baseline to compare subsequent models. Overall, this isn’t too bad.

# RMSE Helper function
rmse <- function(ytrue, ypred){
  mse = mean((ytrue - ypred)^2)
  return (sqrt(mse))
}

# Predict on the training set and report the baseline RMSE
y_hat <- predict(complete_pool)

baseline_rmse <- tibble(RMSE = rmse(bounce_data$bounce_time, y_hat))
kable(baseline_rmse) %>%
  row_spec(0, background = "#4CAF50", color = "#FFF") %>%
  kable_styling(full_width = FALSE, position = "left")
RMSE
14.08967

No Pooling (Individual County Linear Regressions)

Next let’s explore the scenario where there is no pooling (i.e. we consider each county individually and fit a linear model to each) to see if this improves fit and performance. First, let’s make sure this makes sense and that each county can be considered an individual group.

# Visual check: do bounce times differ enough across counties to justify grouping?
bounce_data %>%
  ggplot(aes(x = county, y = bounce_time, fill = county)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    x = "County", y = "Bounce Time",
    title = "Bounce times variance and means seem to be different across county groups"
  ) +
  theme(legend.position = "none")

(Note: this model throws out a warning regarding a “singular fit” which will be relevant in subsequent section and the Bayesian section. It simply means the model specified is too complex to fit given the data. This can happen when we try to estimate multiple random effects (e.g. random intercept AND slope) for groups with small sample sizes)

# No pooling: fit an independent OLS regression per county via lme4::lmList
# (the `|county` term splits the data by group; no information is shared)
no_pool <- lmList(bounce_time ~ std_age|county, data = bounce_data)

summary(no_pool)
## Call:
##   Model: bounce_time ~ std_age | NULL 
##    Data: bounce_data 
## 
## Coefficients:
##    (Intercept) 
##          Estimate Std. Error   t value      Pr(>|t|)
## cheshire 212.2132  0.7937946 267.34020  0.000000e+00
## cumbria  208.1792  0.9301023 223.82396  0.000000e+00
## devon    197.7295  1.7177841 115.10732  0.000000e+00
## dorset   200.4056  1.1395653 175.86149  0.000000e+00
## essex    207.4982  2.0105927 103.20249  0.000000e+00
## kent     219.6813  5.0430739  43.56099 1.579812e-187
## london   179.6383  2.7072670  66.35413 1.072602e-277
## norfolk  180.5272  1.1901381 151.68594  0.000000e+00
##    std_age 
##            Estimate Std. Error    t value   Pr(>|t|)
## cheshire  1.4692787  0.9571419  1.5350689 0.12529668
## cumbria   1.2223448  1.0387782  1.1767139 0.23977866
## devon     1.3490906  1.2668644  1.0649053 0.28734928
## dorset    2.1612633  1.1443696  1.8886060 0.05942916
## essex    -0.6887502  2.9889972 -0.2304285 0.81783774
## kent      0.7038258  4.0887423  0.1721375 0.86338778
## london    0.4334903  2.2703109  0.1909387 0.84863851
## norfolk   0.5113911  1.1967166  0.4273285 0.66929418
## 
## Residual standard error: 7.973379 on 597 degrees of freedom

Diagnostics

It seems that adding individual trendlines improves the model’s fit, given we only see minor trends on the right side of the plot.

# Residuals vs fitted for the per-county fits; collect fitted values and
# residuals into a tibble, then plot with ggplot() (qplot() is deprecated)
no_pool_df <- tibble(fit = fitted(no_pool),
                     resid = residuals(no_pool))

ggplot(no_pool_df, aes(x = fit, y = resid)) +
  geom_point(alpha = 0.8) +
  geom_smooth(se = FALSE, size = 1) +
  labs(y = "Observed - Predicted", x = "Predicted",
       title = "No pooling model shows better fit (variance isn't heteroscedastic)") +
  theme(legend.position = "none")

Evaluation

This model improves the predictive performance compared to the complete pool model, which is expected if we fit a regression line for each group individually.

# Predict on the training set with the per-county models
y_hat_np <- predict(no_pool)

# Collect both models' RMSEs side by side, using "RMSE" as the row label
np_preds <- tibble(rmse = "RMSE",
                   complete_pool = rmse(bounce_data$bounce_time, y_hat),
                   no_pool = rmse(bounce_data$bounce_time, y_hat_np)) %>%
  column_to_rownames(var = "rmse")

# Styled comparison table
np_preds %>%
  kable(digits = 3) %>%
  row_spec(0, background = "#4CAF50", color = "#FFF") %>%
  kable_styling(full_width = FALSE, position = "left")
complete_pool no_pool
RMSE 14.09 7.869

This sounds appealing if you’re interested in predictive performance; however, we must be careful when it comes to interpretation and make sure it answers our original goal.

  1. First of all, some of the group estimates were computed with very small sample sizes (e.g. \(n = 7\)) which can lead to higher variance and out-of-sample RMSE (e.g. a new data point / outlier might change the trendline direction completely).
  2. Second, we are reducing the sample size to the sample sizes per group, which might lead to Type 1 errors when performing multiple comparisons.
  3. Most importantly, the model doesn’t account for relationships or correlations between the groups and doesn’t help us answer the original question of how age affects bounce times across counties.
# Per-county OLS trendlines; faceting makes the small groups (kent, essex) stand out
ggplot(bounce_data, aes(x = std_age, y = bounce_time, color = county)) +
  geom_point(alpha = 0.5) +
  stat_smooth(method = "lm", se = FALSE) +
  facet_wrap(~county) +
  labs(
    x = "Age (standardize)", y = "Bounce Time",
    title = "Kent and Essex's estimated trendlines rely on very few data points"
  ) +
  theme(legend.position = "none",
        title = element_text(size = 10))

Complete pooling to no pooling

So how does this look like in terms of the estimated slope and intercept parameters? The animation below shows how the intercept and std_age estimates change when we consider all points as part of one group (complete pool) vs. as independent groups (no pool).

Here we see an example of Simpson’s paradox where the slope std_age is much larger when grouping isn’t considered compared to when grouping is considered.

# Build df for animation: start with the no pool coefficients (one row per county)
pool_to_nopool <- tibble(model="no_pool",
                         intercept= coef(no_pool)[,1],
                         slope = coef(no_pool)[,2])

# Add the complete pool coefficients, repeated once per county so both
# animation states contain the same number of points
tmp_df <- tibble(model="complete_pool",
                 intercept=rep(coef(complete_pool)[1], nrow(pool_to_nopool)),
                 slope = rep(coef(complete_pool)[2], nrow(pool_to_nopool)))

pool_to_nopool <- bind_rows(pool_to_nopool, tmp_df)

# Animate the transition between the two coefficient sets.
# NOTE(review): group=1L appears to make gganimate tween the points between
# states rather than fade them — confirm against gganimate docs.
animate(ggplot(pool_to_nopool, aes(x =intercept, y=slope)) +
          geom_point(aes(color=model, group=1L)) +
          scale_color_discrete(name="", 
                             labels=c("Complete Pooling", "No Pooling")) +
          labs(x=expression(paste(hat(beta[0]), " (Intercept)")),
               y=expression(paste(hat(beta[1]), " (Standardized Age)")),
               title="In the no pooling model each county has its own slope and intercept") +
          theme_bw()+
          theme(legend.position="bottom",
                title= element_text(size=7)) +
          transition_states(model, transition_length = 1, state_length = 1,
                            wrap=FALSE) +
          shadow_wake(wake_length = 0.1, alpha = FALSE),
  res=120, width=700)

Partial-Pooling (Hierarchical / Mixed-Effects Model)

Here’s where taking into account the nested (hierarchical) structure can help mitigate some of the issues listed above and also helps us answer the initial question.

Here I considered random intercept and slopes per county as part of this model. (Note: you could consider models with either of these and see if it performs better than including both).

In the output below, we can see that most of the between-county variance is explained by the random intercept.

# Partial pooling: mixed-effects model with a random intercept AND a random
# std_age slope per county, plus population-level (fixed) intercept and slope
lmm <- lmer(bounce_time ~ 1 + std_age + (1 + std_age|county), data=bounce_data)

summary(lmm)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bounce_time ~ 1 + std_age + (1 + std_age | county)
##    Data: bounce_data
## 
## REML criterion at convergence: 4313
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1204 -0.6101  0.0601  0.6171  3.3248 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr
##  county   (Intercept) 195.77554 13.9920      
##           std_age       0.06781  0.2604  1.00
##  Residual              62.98119  7.9361      
## Number of obs: 613, groups:  county, 8
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 200.8082     4.9724  40.385
## std_age       1.3044     0.4787   2.725
## 
## Correlation of Fixed Effects:
##         (Intr)
## std_age 0.186 
## convergence code: 0
## boundary (singular) fit: see ?isSingular

Diagnostics

The fitted vs residual plot looks good with no trend; however, it is very similar to that of the no pooling model.

# Residuals vs fitted for the partial-pooling (mixed effects) model.
# Fixes a copy-paste bug: the original title said "No pooling model" even
# though this chunk plots the lmm fit. Also uses ggplot() instead of the
# deprecated qplot().
partial_pool_df <- tibble(fit = fitted(lmm),
                          resid = residuals(lmm))

ggplot(partial_pool_df, aes(x = fit, y = resid)) +
  geom_point(alpha = 0.8) +
  geom_smooth(se = FALSE, size = 1) +
  labs(y = "Observed - Predicted", x = "Predicted",
       title = "Partial pooling model shows good fit (variance isn't heteroscedastic)") +
  theme(legend.position = "none")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Let’s use a Q-Q plot to check whether the normality assumption is met. Here we expect the sample to follow the theoretical quantiles of a normal distribution (black line).

We see that the partial-pooling model (Mixed Effects) is slightly better in that its residuals are closer to the 1-1 line, especially at the tails, compared to the no pool model.

# Compare residual distributions of the partial pool and no pool models.
# Reshape to long form so each model becomes a color group in one Q-Q plot.
qq_df <- tibble(partial_resid = resid(lmm),
                no_pool_resid = resid(no_pool)) %>%
  pivot_longer(cols = c("partial_resid", "no_pool_resid"),
               names_to = "model", values_to = "residuals")

qq_df %>%
  ggplot(aes(sample = residuals, color = model)) +
  stat_qq(alpha = 0.5, size = 3) +
  stat_qq_line(color = "black") +
  labs(
    x = "Theoretical Quantiles", y = "Sample Quantiles",
    title = "Normal Q-Q plot: both models have nearly identical residual normal distributions"
  ) +
  theme(title = element_text(size = 10))

Evaluation

The RMSE of the no pool is slightly smaller, so predictive performance doesn’t really improve with partial pooling in our scenario. However, we are now able to answer the initial questions which we weren’t able to answer with the no pool model.

# Fitted values from the partial-pooling (mixed effects) model
lmm_yhat <- predict(lmm)

# Append the partial-pooling RMSE next to the earlier models' scores
np_preds <- mutate(np_preds,
                   partial_pool = rmse(bounce_data$bounce_time, lmm_yhat))

# Render the comparison table with the same styling as the earlier tables
np_preds %>%
  kable(digits = 3) %>%
  row_spec(0, background = "#4CAF50", color="#FFF") %>%
  kable_styling(full_width = FALSE, position = "left")
complete_pool no_pool partial_pool
14.09 7.869 7.878

No pooling to partial-pooling

So why is this happening, and what does it look like?

It happens because now we are assuming the groupings come from the same population, instead of being independent groups from distinct populations (no pool), and thus they share characteristics to some degree.

Here we can observe the shrinkage of estimates and standard errors (SE) for each of the random effects (i.e. intercept and slope per county).

However, it is important to note that fitting this model also led to singular fit warnings which indicate instability. (see shrinkage of std_age estimate tab for more info)

Note: it is important to remember that shrinkage might not be desired in some cases. For example, if our response is mortality rates in hospitals we might not want to “shrink” mortalities of smaller hospitals just because they have a smaller sample size. It might lead to erroneous interpretations.

Shrinkage std_age and intercept

Here we can observe how shrinkage changes between the complete pooling, no pooling and partial pooling models.

However, why do some county estimates exhibit larger changes (i.e. more displacement) between the no pool and partial pool models? (see next tab for answers)

# Append the partial-pooling per-county (intercept, slope) estimates to the
# complete-pool / no-pool estimates collected earlier
pool_to_nopool <- bind_rows(pool_to_nopool,
                             tibble(model = 'partial_pool',
                                    intercept=coef(lmm)$county[,1],
                                    slope=coef(lmm)$county[,2]))

# Animate
# Legend labels rely on the alphabetical ordering of the `model` values
# (complete_pool, no_pool, partial_pool) -- keep that ordering in sync
animate(ggplot(pool_to_nopool, aes(x =intercept, y=slope)) +
          geom_point(aes(color=model, group=1L)) +
          scale_color_discrete(name="", 
                             labels=c("Complete Pooling", "No Pooling", 
                                      "Partial Pooling")) +
          labs(x=expression(paste(hat(beta[0]), " (Intercept)")),
               y=expression(paste(hat(beta[1]), " (Standardized Age)")),
               title="Effects on std_age and intercept estimates for various pooling models") +
          theme_bw()+
          theme(legend.position="bottom",
                title= element_text(size=7)) +
          # transition between the three model states; wake trail shows the
          # displacement (shrinkage) of each county's estimate
          transition_states(model, transition_length = 1, state_length = 1,
                            wrap=FALSE) +
          shadow_wake(wake_length = 0.2, alpha = FALSE),
  res=120, width=700)

Shrinkage of Intercept estimate

It happens because the partial pool parameter estimates for a particular group \(j\) are related to the group’s sample size \(n_j\), and to the complete pool \(\bar{y}_{all}\) and unpooled \(\bar{y}_j\) estimates, through a weighted average. It also involves the population-level variance \(\sigma^2\) and the group-level variance \(\tau^2\).

\[\hat{\alpha}_j \approx \frac{\frac{n_j}{\sigma^2}}{\frac{n_j}{\sigma^2} + \frac{1}{\tau^2}}\bar{y}_j + \frac{ \frac{1}{\tau^2}}{\frac{n_j}{\sigma^2} + \frac{1}{\tau^2}}\bar{y}_{all}\]

Observe below how both the estimates and standard errors (SE) shrink when we fit the mixed effects model (partial pooling). The effect is stronger in groups with small sample sizes and less so in those with larger sample sizes.

This is why we say that estimates in hierarchical models “borrow strength”. Those with small sample sizes borrow more strength from those with larger samples (which makes sense because larger sample size -> more information -> more stable estimates)

# Collect all confidence intervals and estimates
ci_np <- confint(no_pool) # 95% confidence interval
ci_pp <- se.coef(lmm)$county # per-county random-effect std errors (arm)

np_betas <- coef(no_pool) # estimates no pool
pp_betas <- coef(lmm)$county # per-county estimates, partial pool

# Prepare data for intercept shrinkage animation
# Group sample sizes drive the amount of shrinkage, so we plot against n
county_n <- bounce_data %>% group_by(county) %>% count()

# No-pool intercepts with their 95% CIs
beta0_df <- tibble(model = "No Pool",
                  estimate = np_betas[,1],
                  lower = ci_np[,,"(Intercept)"][,1],
                  upper = ci_np[,,"(Intercept)"][,2])

# Partial-pool intercepts with approximate 95% intervals (estimate +/- 1.96 SE)
beta0_pp <- tibble(model = "Partial Pool",
                 estimate = pp_betas[,1]) %>% 
  mutate(lower = estimate - 1.96*ci_pp[,1],
         upper = estimate + 1.96*ci_pp[,1])

# Stack both models; n is repeated because county order matches in both
beta0_df <- bind_rows(beta0_df, beta0_pp) %>% 
  mutate(n = rep(county_n$n, 2))


# Animate the transition from no-pooling to partial-pooling estimates
# (title typo fixed: "srhinks" -> "shrinks")
animate(ggplot(beta0_df, aes(x=n, y=estimate)) +
          geom_point(aes(color=model, group=1L), size= 2, alpha=0.4) +
          geom_errorbar(aes(ymin = lower, ymax = upper, color=model, group=1L),
                        width=6) +
          # population-level (complete pool) intercept for reference
          geom_hline(aes(yintercept =coef(complete_pool)[1]), 
                     color="black", alpha=0.4) +
          geom_text(aes(label ="Pop avg",
                         x= 10, y =coef(complete_pool)[1]-2),
                    size=3)+
          scale_color_discrete(name="", 
                             labels=c("No Pooling", "Partial Pooling")) +
          labs(y=expression(paste(hat(beta[0]), " (Intercept) Estimate")),
               x="Group sample size (n)",
               title="Partial-pooling shrinks no-pooling Intercept estimates and Std Errors") +
          theme_bw()+
          theme(legend.position="bottom",
                title= element_text(size=7)) +
          transition_states(model, transition_length = 0.7, state_length = 1,
                            wrap=FALSE),
        res=120, width=700, height=600)

Shrinkage of std_age estimate

Recall the singular fit message mentioned earlier in this document. The diagram below showcases the results of that warning, which indicates that the model is too complex for the amount of data available in each group.

If we fit a model with no random slope we don’t get the singular fit error, and our RMSE is nearly the same as the one including the random slope.

# Random-intercept-only model: drop the random slope that caused the
# singular fit
rintercept <- lmer(bounce_time ~ std_age + (1 | county), data = bounce_data)

# Score it and append to the running RMSE comparison
rintercept_rmse <- rmse(bounce_data$bounce_time, predict(rintercept))
np_preds <- mutate(np_preds, rintercept_only = rintercept_rmse)

# Table
np_preds %>%
  kable(digits = 3) %>%
  row_spec(0, background = "#4CAF50", color="#FFF") %>%
  kable_styling(full_width = FALSE, position = "left")
complete_pool no_pool partial_pool rintercept_only
14.09 7.869 7.878 7.88

This means that the random intercept estimate accounts for most of the between-county variability. Thus, in simple terms, when we attempt to estimate the random slope from the remaining variance in groups with small sample sizes, the model can’t produce stable estimates using this remaining information. Therefore, we observe a slight shrinkage of the estimates but an extreme shrinkage of the standard errors.

Here is a situation where a Bayesian framework can assist and provide stable estimates.

# No-pool slope (std_age) estimates with their 95% CIs
beta1_df <- tibble(model = "No Pool",
                  estimate = np_betas[,2],
                  lower = ci_np[,,"std_age"][,1],
                  upper = ci_np[,,"std_age"][,2])

# Partial-pool slopes with approximate 95% intervals (estimate +/- 1.96 SE);
# these SEs are the ones that collapse under the singular fit
beta1_pp <- tibble(model = "Partial Pool",
                 estimate = pp_betas[,2]) %>% 
  mutate(lower = estimate - 1.96*ci_pp[,2],
         upper = estimate + 1.96*ci_pp[,2])

# Stack both models; n repeated because county order matches in both
beta1_df <- bind_rows(beta1_df, beta1_pp) %>% 
  mutate(n = rep(county_n$n, 2))


# Animate no-pool -> partial-pool slope shrinkage against group sample size
animate(ggplot(beta1_df, aes(x=n, y=estimate)) +
          geom_point(aes(color=model, group=1L), size=2, alpha=0.4) +
          geom_errorbar(aes(ymin = lower, ymax = upper, color=model, group=1L),
                        width= 6) +
          # population-level (complete pool) slope for reference
          geom_hline(aes(yintercept =coef(complete_pool)[2]), 
                     color="black", alpha=0.4) +
          geom_text(aes(label ="Pop avg",
                         x= 10, y =coef(complete_pool)[2]+0.5),
                    size=3)+
          scale_color_discrete(name="", 
                             labels=c("No Pooling", "Partial Pooling")) +
          labs(y=expression(paste(hat(beta[1]), " (std_age) Estimate")),
               x="Group sample size (n)",
               title="Partial-pooling shrinks no-pooling slope (std_age) estimates and Std Errors") +
          theme_bw()+
          theme(legend.position="bottom",
                title= element_text(size=7)) +
          transition_states(model, transition_length = 0.7, state_length = 1,
                            wrap=FALSE),
        res=120, width=700, height=600)

Bayesian Mixed-Effects Models

Here we will use the brms library which, given it uses a similar syntax as the lme4 functions above, it makes it very simple to implement a Bayesian hierarchical model using stan.

Selecting a prior

We quickly do some prior predictive checks with simulated data to choose some weakly informative priors

# Simulate 
# Prior predictive check: draw parameters from the candidate priors and
# generate one synthetic bounce time per row of the observed data
y_sim <- rep(0,nrow(bounce_data))
for (i in 1:nrow(bounce_data)){
  # NOTE(review): fresh sigma/intercept/slope draws are made for EVERY
  # observation rather than once per simulated dataset -- confirm this is
  # the intended prior predictive scheme
  sigma <- rhcauchy(1, sigma = 1) # half-Cauchy prior on residual scale
  mu <- rnorm(1, mean=200, sd=100) + (rnorm(1, mean=1, sd=1)*bounce_data$std_age[i])
  y_sim[i] <- rnorm(1, mean=mu, sd=sigma)
}

# Compare the simulated draws against the empirical bounce times
tibble(bounce_time = bounce_data$bounce_time, sim_data = y_sim) %>% 
  ggplot(., aes(x=bounce_time, y=sim_data)) +
  geom_point() +
  labs(x="Empirical bounce times", y="Simulated data")

Modeling

We can now fit the brms model and obtain estimates for the county intercepts and slopes with associated 95% Credible Intervals. Note that the credible intervals for the std_age variable are more stable instead of vanishing.

# Bayesian mixed effects model: same formula as the lme4 fit above, now with
# explicit weakly informative priors.
# NOTE(review): the prior predictive simulation above used normal(200, 100)
# for the intercept and normal(1, 1) for the slope, which differs from the
# normal(0, 10) priors here -- confirm this is intentional (brms centers
# predictors when parameterizing the intercept).
bayes_lmm <- brm(bounce_time ~ 1 + std_age + (1 + std_age|county), 
                 data=bounce_data,
                 family=gaussian(),
                 # half-Cauchy on scale parameters (sigma, group sds),
                 # normal on location parameters
                 prior= c(prior(normal(0, 10), class = Intercept),
                          prior(normal(0, 10), class = b),
                          prior(cauchy(0, 1), class = sigma),
                          prior(cauchy(0, 1), class = sd)),
                 warmup = 2000, 
                 iter = 5000, 
                 # adapt_delta raised to 0.95 to reduce divergent transitions
                 chains = 2, control = list(adapt_delta = 0.95))
## Compiling the C++ model
## Start sampling
## 
## SAMPLING FOR MODEL 'fa4d33dd312da9c9720bb5268f8a05e0' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000205 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 2.05 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 1: Iteration:  500 / 5000 [ 10%]  (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1500 / 5000 [ 30%]  (Warmup)
## Chain 1: Iteration: 2000 / 5000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2001 / 5000 [ 40%]  (Sampling)
## Chain 1: Iteration: 2500 / 5000 [ 50%]  (Sampling)
## Chain 1: Iteration: 3000 / 5000 [ 60%]  (Sampling)
## Chain 1: Iteration: 3500 / 5000 [ 70%]  (Sampling)
## Chain 1: Iteration: 4000 / 5000 [ 80%]  (Sampling)
## Chain 1: Iteration: 4500 / 5000 [ 90%]  (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 81.0944 seconds (Warm-up)
## Chain 1:                134.03 seconds (Sampling)
## Chain 1:                215.125 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'fa4d33dd312da9c9720bb5268f8a05e0' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 6.8e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.68 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 2: Iteration:  500 / 5000 [ 10%]  (Warmup)
## Chain 2: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 2: Iteration: 1500 / 5000 [ 30%]  (Warmup)
## Chain 2: Iteration: 2000 / 5000 [ 40%]  (Warmup)
## Chain 2: Iteration: 2001 / 5000 [ 40%]  (Sampling)
## Chain 2: Iteration: 2500 / 5000 [ 50%]  (Sampling)
## Chain 2: Iteration: 3000 / 5000 [ 60%]  (Sampling)
## Chain 2: Iteration: 3500 / 5000 [ 70%]  (Sampling)
## Chain 2: Iteration: 4000 / 5000 [ 80%]  (Sampling)
## Chain 2: Iteration: 4500 / 5000 [ 90%]  (Sampling)
## Chain 2: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 92.8436 seconds (Warm-up)
## Chain 2:                167.796 seconds (Sampling)
## Chain 2:                260.64 seconds (Total)
## Chain 2:
## Warning: There were 3556 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## http://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
# Per-county coefficients (fixed effects + random-effect offsets) with
# posterior means, errors and 95% credible intervals -- note the std_age
# intervals below stay wide instead of collapsing like the frequentist SEs
coef(bayes_lmm)
## $county
## , , Intercept
## 
##          Estimate Est.Error     Q2.5    Q97.5
## cheshire 212.1214 0.7119139 210.7442 213.5155
## cumbria  208.1579 0.8233143 206.5963 209.8134
## devon    197.8248 1.1571974 195.5084 200.1033
## dorset   200.9539 0.9344757 199.1088 202.8154
## essex    208.1383 1.7382546 204.7986 211.4762
## kent     219.1399 3.0462198 213.0848 225.0882
## london   180.4238 1.4930177 177.5210 183.2920
## norfolk  181.0243 0.8791062 179.2814 182.7416
## 
## , , std_age
## 
##          Estimate Est.Error        Q2.5    Q97.5
## cheshire 1.275370 0.5829055  0.13853592 2.436951
## cumbria  1.230252 0.6049861  0.03712182 2.408146
## devon    1.258642 0.6291188  0.03379872 2.535885
## dorset   1.342208 0.6152464  0.21480077 2.632605
## essex    1.183787 0.7310108 -0.36798154 2.578821
## kent     1.231249 0.7445650 -0.30494908 2.703890
## london   1.186572 0.6967927 -0.25597282 2.539004
## norfolk  1.147106 0.6304039 -0.18179507 2.289621

Diagnostics

Here we can make use of the bayesplot package

Posterior predictive distribution bounce_time

color_scheme_set("red")
ppc_dens_overlay(y = bounce_data$bounce_time,
                 yrep = posterior_predict(bayes_lmm, nsamples = 50)) +
  labs(x="Bounce time (secs)") +
  panel_bg(fill = "gray95", color = NA) +
  grid_lines(color = "white")

Posterior predictive checks per county

color_scheme_set("brightblue")
bayes_lmm %>%
  posterior_predict(nsamples = 500) %>%
  ppc_stat_grouped(y = bounce_data$bounce_time,
                   group = bounce_data$county,
                   stat = "mean") +
  labs(x= "Bounce times (ms)") +
  panel_bg(fill = "gray95", color = NA) +
  grid_lines(color = "white")

Posterior predictive checks: MCMC Divergence

color_scheme_set("darkgray")
mcmc_scatter(
  as.matrix(bayes_lmm),
  pars = c("sd_county__Intercept",
           "r_county[london,Intercept]"),
  np = nuts_params(bayes_lmm),
  np_style = scatter_style_np(div_color = "green", div_alpha = 0.8)
) +
  labs(x = "Standard Dev of County X Intercept",
       y= "Intercept of County",
       titles = "(No green dots means no divergence, thus good mixing and non-curvy posterior)")

Evaluation

Same performance, but let’s look at what happens to the std_age estimates and errors when we use a Bayesian framework

# Point predictions: posterior mean of 1000 posterior predictive draws
posterior_draws <- posterior_predict(bayes_lmm, nsamples=1000)
y_hat_bayes <- colMeans(posterior_draws)

# Append the Bayesian model's RMSE to the comparison
np_preds <- mutate(np_preds,
                   bayes = rmse(bounce_data$bounce_time, y_hat_bayes))

# Styled comparison table, same formatting as the earlier tables
np_preds %>%
  kable(digits = 3) %>%
  row_spec(0, background = "#4CAF50", color="#FFF") %>%
  kable_styling(full_width = FALSE, position = "left")
complete_pool no_pool partial_pool rintercept_only bayes
14.09 7.869 7.878 7.88 7.863

Shrinkage of std_age estimate (Bayes)

# Bayesian per-county std_age estimates with 95% credible bounds;
# Est.Error dropped so columns line up with the frequentist data frames
bayes_beta1 <- coef(bayes_lmm)$county[,,"std_age"] %>% 
  data.frame() %>% 
  dplyr::select(-Est.Error) %>% 
  mutate(model ="PP Bayes",
         n=county_n$n)

# Rename to match beta1_df so the rows can be stacked
colnames(bayes_beta1) <- c("estimate", "lower", "upper",
                           "model", "n")

beta1_df <- bind_rows(beta1_df, bayes_beta1)


# Animate across the three models; legend labels follow the alphabetical
# ordering of `model` ("No Pool", "PP Bayes", "Partial Pool")
animate(ggplot(beta1_df, aes(x=n, y=estimate)) +
          geom_point(aes(color=model, group=1L), size=2, alpha=0.4) +
          geom_errorbar(aes(ymin = lower, ymax = upper, color=model, group=1L),
                        width= 6) +
           geom_hline(aes(yintercept =coef(complete_pool)[2]), 
                     color="black", alpha=0.4) +
          geom_text(aes(label ="Pop avg",
                         x= 10, y =coef(complete_pool)[2]+0.5),
                    size=3) +
          scale_color_discrete(name="", 
                             labels=c("No pool", "Freq PP", "Bayes PP")) +
          labs(y=expression(paste(hat(beta[1]), " (std_age) Estimate")),
               x="Group sample size (n)",
               title="Bayesian Partial-Pooling doesn't shrink Std Errors as much as the Frequentist PP model ") +
          theme_bw()+
          theme(legend.position="bottom",
                title= element_text(size=7)) +
          transition_states(model, transition_length = 0.8, state_length = 1.2,
                            wrap=FALSE),
        res=120, width=700, height=600)

Bayesian linear fits

Here we see that for counties with more samples the predicted linear trends are very close together (we are more certain of those ones). In comparison, the trends in counties with small sample show lots of uncertainty (i.e. larger estimation errors).

# Draw 100 posterior fitted lines per county; the spread of lines shows the
# uncertainty of each county-level linear trend
# NOTE(review): add_fitted_draws() is deprecated in newer tidybayes releases
# in favor of add_epred_draws() -- confirm against the installed version
add_fitted_draws(model=bayes_lmm, newdata=bounce_data, n = 100) %>%        
  ggplot(aes(x = std_age, y = bounce_time, color=county) ) +
  geom_point() +
  geom_line(aes(y = .value, group = .draw), alpha = 1/15, color = "#08519C") +
  facet_wrap(~county, scales="free") +
  theme(legend.position = "none")